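/***
> Constructs the base Compustat annual sample.

File Inputs:
- funda.dta
- FFind.ado (run to obtain the FF_48ind industry codes)
- TextBasedConstraintsDatabase.dta
- firms_AP_AR.dta
- inventory_annual.dta

File Output:
- __compustat_cleaned.dta

***/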

* ---- Setup: working directory and raw input file ----
global dir your_path
cd "$dir"

use "$dir/funda.dta", clear


tab fyear
* Keep a single observation per firm-year; -force- discards the extra rows
* even when they differ on other variables.
duplicates drop gvkey fyear, force
destring gvkey, replace
sort gvkey fyear
xtset gvkey fyear

* Firm age: years since the IPO year (IPO year counts as age 1).
gen age = fyear - year(ipodate) + 1
* Fallback when the IPO date is missing or lies after the fiscal year:
* use the firm's observation number within the panel (1, 2, 3, ...).
* A by-group _n is used instead of egen min(_n), because explicit
* subscripting inside egen is not reliable, and no helper variables remain.
bysort gvkey (fyear): replace age = _n if (missing(ipodate) | age < 0)

destring sic, replace

* Run the FFind program; of the industry variables it leaves behind, only
* FF_48ind is used below, so the 49- and 12-industry variants are discarded.
quietly do "FFind.ado"
quietly FFind
drop FF_49ind* FF_12ind*

sort FF_48ind
* Industry exclusions based on FF_48ind:
*   31      utilities
*   44-47   banking, insurance, real estate, trading
drop if FF_48ind == 31
drop if inrange(FF_48ind, 44, 47)
* SIC-based exclusions.
* NOTE(review): the second range starts at 3999, not 4900 -- confirm that
* dropping all of SIC 3999-4999 is intended.
drop if (sic >= 3999 & sic < 5000) | (sic >= 6000 & sic < 7000)
sort gvkey fyear
* Log market capitalization.
gen mcap = log(prcc_f*csho)

* Book leverage: total debt over assets; if one debt component is missing,
* fall back to long-term debt only, then to debt in current liabilities only.
gen leverage = (dltt + dlc)/at
replace leverage = dltt/at if missing(leverage)
replace leverage = dlc/at if missing(leverage)
* Cap at 1 AFTER the fallbacks so that every definition is bounded,
* mirroring the treatment of mktlev below.
replace leverage = 1 if leverage > 1 & !missing(leverage)

* Market leverage: debt over (liabilities + market value of equity),
* with the same fallbacks and the same cap.
gen mktlev = (dltt+dlc)/(at-ceq + prcc_f*csho)
replace mktlev = (dltt )/(at-ceq + prcc_f*csho) if missing(mktlev)
replace mktlev = dlc/(at-ceq + prcc_f*csho) if missing(mktlev)
replace mktlev = 1 if mktlev > 1 & !missing(mktlev)

sort gvkey fyear

* Items treated as zero when not reported. None of them is read before the
* point where it was originally filled, so filling all three up front is
* equivalent.
foreach item of varlist xrd wcap xad {
	replace `item' = 0 if missing(`item')
}

* Market value of equity, reused in several ratios below.
local mve prcc_f*csho

* ---- Size, investment and R&D in logs ----
gen logSale = ln(sale + 1)
gen logat   = ln(at)
gen logcapx = ln(capx + 1)
gen logrnd  = ln(xrd + 1)

* ---- Profitability and R&D scaled by lagged assets ----
gen RD   = xrd/L.at
gen ROA  = ni/L.at
gen ROA2 = ebitda/L.at
replace ROA2 = ebit/L.at if missing(ROA2)

* ---- Valuation, incorporation, asset structure ----
gen mtb         = (at + `mve' - ceq)/at
gen DEincorp    = (incorp == "DE")
gen tangibility = ppent/at
gen MVD         = `mve'/lt

* Z-score built from working capital, retained earnings, EBIT and book equity.
gen zscore = 6.56*wcap/at + 3.26*(re/at) + 6.72*(ebit/at) + 1.05*(ceq/lt)

* Cash flow over lagged assets; falls back to ib alone when ib + dp is missing.
gen cf = (ib + dp)/L.at
replace cf = (ib)/L.at if missing(cf)
//replace cf = ebitda/l.at if missing(cf)

* Cash holdings over assets; falls back to ch + ivst when che is missing.
gen cash = (che)/at
replace cash = (ch + ivst)/at if missing(cash)

* Receivables relative to payables.
gen recap = rect/ap
replace recap = 0 if missing(rect)
replace recap = 1 if missing(ap) | ap == 0

* Capital-to-sales ratio.
gen KS = ppent/sale
replace KS = 0 if ppent == 0
replace KS = 0 if missing(KS) & sale <= 0

* R&D and advertising intensity relative to net PP&E.
gen RDint = xrd/ppent
replace RDint = 0 if xrd == 0
gen ADint = xad/ppent

* Investment rates: capx over net (ppent) and over gross (ppegt) PP&E.
* Set to 0 when capx is zero, then to 1 when the denominator is zero/missing.
foreach spec in "invrate ppent" "invrate2 ppegt" {
	local rate : word 1 of `spec'
	local base : word 2 of `spec'
	gen `rate' = capx/`base'
	replace `rate' = 0 if capx == 0
	replace `rate' = 1 if (`base' == 0 | missing(`base'))
}

* Q uses the same formula as mtb; inv and rnd are scaled by lagged assets.
gen Q   = (at + `mve' - ceq)/at
gen inv = capx/L.at
gen rnd = xrd/L.at

sort gvkey fyear
* Equity issuance: change in common equity net of the change in retained
* earnings, scaled by lagged assets. Missing values are set to zero.
gen eq_iss = (ceq-l.ceq - re + l.re)/l.at
replace eq_iss = 0 if missing(eq_iss)
* Label matches the formula actually computed above.
label var eq_iss ///
"(ceq-l.ceq - re + l.re)/l.at"
* Debt issuance: change in long-term debt plus debt in current liabilities,
* scaled by lagged assets. Missing values are set to zero.
gen debt_iss = (dltt-l.dltt + dlc - l.dlc)/l.at
replace debt_iss = 0 if missing(debt_iss)
* Label matches the formula actually computed above.
label var debt_iss ///
"(dltt-l.dltt + dlc - l.dlc)/l.at"

* ---- Sample filters ----
drop if sale < 0
count
* These filters are ACTIVE: require non-missing assets, sales, capex,
* net PP&E and fiscal-year-end price.
drop if missing(at)
drop if missing(sale)
drop if missing(capx)
drop if missing(ppent)
drop if missing(prcc_f)


xtset gvkey fyear

* ======================= Components of the WW index =======================

* CF: cash flow scaled by lagged total assets.
gen CF = (ib + dp)/L.at
label variable CF "Cash Flow = (ib + dp)/at[_n-1] defined in Titman"

* DIVdum: indicator equal to 1 when dv is positive and non-missing.
gen DIVdum = (dv>0 & !missing(dv))
label variable DIVdum "Dummy = 1 if firm pays cash dividends (dv>0 & !missing(dv))"

* TLTD: long-term debt over total assets.
gen TLTD = dltt/at
label var TLTD "Longterm debt to assets = dltt/at"

* LNTA: log of total assets (copy of logat).
gen LNTA = logat
label var LNTA "Log of total assets"

* ISG: 3-digit SIC industry sales relative to the firm's lagged industry sales.
* The Compustat sic code is used here; the CRSP siccd would be an alternative.
* NOTE(review): ISG and SG are gross ratios (current/lagged), not net growth
* rates -- confirm this is the intended input to the index.
destring sic, replace
gen sic3 = int(sic/10)
egen indsale = total(sale), by(sic3 fyear)
gen ISG = cond(L.indsale == 0, 0, indsale/L.indsale)

* SG: firm sales relative to lagged sales; 0 when either side is zero.
gen SG = cond(sale == 0 | L.sale == 0, 0, sale/L.sale)

* ============================ The index itself ============================
gen WW = -.091*CF - .062*DIVdum + .021*TLTD ///
	- .044*LNTA + .102*ISG - .035*SG

sort gvkey fyear
* SA index = -.737*size + .043*size^2 - .040*age, where size is log assets
* capped at ln(4500).
gen logass = logat

* Stata evaluates a missing value as larger than any number, so "at > 4500"
* alone would also cap rows with missing assets and hand them a non-missing
* SA. The explicit guard keeps SA missing for such rows.
replace logass = ln(4500) if at > 4500 & !missing(at)
gen SA = -.737*(logass) + .043*(logass^2) - .040*(age)
drop logass

* Lagged debt due in one year relative to lagged total debt.
gen LTD_DUE2 = l.dd1/(l.dltt + l.dlc)

* Variables retained for the cleaned sample, grouped by role.
local ids        gvkey fyear sic FF_48ind addzip state
local built      Q DEincorp logSale RD ROA ROA2 logat mtb zscore tangibility ///
                 cf cash inv mcap logcapx logrnd debt_iss leverage eq_iss ///
                 recap mktlev KS RDint ADint invrate invrate2 age WW SA LTD_DUE2
local raw_items  emp at xrd capx sale oibdp xint prcc_c prcc_f csho lt txditc ///
                 act lct dlc dltt siv sppe sppiv sret do ppent aqc dd1 dvt ///
                 dvpd teq seq txdbca che np

keep `ids' `built' `raw_items'


* First difference of leverage and net (debt minus equity) issuance.
gen D_leverage = D.leverage
gen netissue = debt_iss - eq_iss

* Require the panel identifiers and total assets.
drop if missing(gvkey, fyear, at)

* Sample window: fiscal years 1985 through 2020.
keep if inrange(fyear, 1985, 2020)

sort gvkey fyear
xtset gvkey fyear

* One-year lags, named l_<variable>. They are created in this order, which a
* later varlist range (l_at-l_debt_iss) relies on.
foreach x of varlist at mktlev capx sale xrd age mcap leverage logSale ///
		logat logcapx logrnd RD ROA mtb tangibility zscore cf cash invrate ///
		Q inv eq_iss debt_iss WW {
	gen l_`x' = L.`x'
}

* Drop observations lacking cash, employment or the z-score. This happens
* after the lags are built, so the lags still draw on these rows.
drop if missing(cash, emp, zscore)


duplicates drop gvkey fyear, force
sort gvkey fyear
xtset gvkey fyear

* Flag observations missing any core regression variable. The flag is set
* before winsorizing, but the rows are only dropped after the first pass, so
* the first-pass percentiles are computed on the pre-drop sample.
gen dropdum = missing(leverage, logat, logSale, tangibility, inv, Q, cf, RD)

* Pass 1: winsorize the core variables at the 1% tails. Each variable is
* renamed to raw<name>, the winsorized copy takes the original name, and the
* raw version is discarded.
foreach v of varlist leverage logat logSale tangibility inv Q cf RD {
	rename `v' raw`v'
	winsor raw`v' if !missing(raw`v'), gen(`v') p(.01)
	drop raw`v'
}
drop if dropdum == 1

* Pass 2: same treatment for the remaining variables and the lags
* l_at through l_debt_iss, now on the reduced sample.
foreach v of varlist ROA cash zscore mtb eq_iss debt_iss l_at-l_debt_iss WW {
	rename `v' raw`v'
	winsor raw`v' if !missing(raw`v'), gen(`v') p(.01)
	drop raw`v'
}
drop dropdum

* Merge key expected by the first using file.
gen year = fyear
* presumably renamed to avoid a clash with a variable in the merged file -- TODO confirm
rename l_WW l_WW_old

*** Merge additional variables (requested later on)
* Unmatched using-only rows are discarded in every merge below.
merge 1:1 gvkey year using "TextBasedConstraintsDatabase.dta", ///
	keep(master match) nogenerate

*** Accounts payable and receivable
merge 1:1 gvkey fyear using "firms_AP_AR.dta", ///
	keepusing(ap rect rectr recco recd cogs invch) ///
	keep(master match) nogenerate
winsor2 ap rect rectr recco recd cogs, replace
* Fall back to total receivables when trade receivables are missing.
replace rectr = rect if missing(rectr)
* Missing invch is set to zero, then reset to missing where it is zero and
* either receivables measure is missing.
replace invch = 0 if missing(invch)
replace invch = . if invch == 0 & missing(rectr, rect)

*** Inventories
merge 1:1 gvkey fyear using "inventory_annual", ///
	keepusing(invt invfg invo invrm invwip) ///
	keep(master match) nogenerate
winsor2 invt invfg invo invrm invwip, replace

count
save "__compustat_cleaned.dta", replace

